'''
    This program will calculate the average wages by degree-college-major for US
    colleges based on the DOE College Scorecard, using the same major assignment 
    used for the Glassdoor resumes.
'''

print("Starting R_create_US_scorecard.")

import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import csv
import statsmodels.api as sm
import statsmodels.formula.api as smf
from datetime import datetime
import gc

fileSeed = "C:/Users/jsock/Dropbox/Research/GD/International"
savePath  = fileSeed + "/Data/"
inputPath = fileSeed + "/InputData/"

def wavg(group,var,weight):
    d = group[var]
    w = group[weight]
    return (d * w).sum() / w.sum()

##############################################################################
# Read in US scorecard and keep only needed data
##############################################################################

scorecard = pd.read_csv(inputPath + 'College_scorecard_field_of_study.csv')

scorecard = scorecard[scorecard.CONTROL!='Foreign']

scorecard = scorecard[['INSTNM','CIPDESC','CREDDESC','EARN_COUNT_WNE_HI_1YR','EARN_COUNT_WNE_HI_2YR',
         'EARN_MDN_HI_1YR','EARN_MDN_HI_2YR']]

##############################################################################
# Clean up scorecard data
##############################################################################

for var in ['EARN_COUNT_WNE_HI_1YR','EARN_COUNT_WNE_HI_2YR','EARN_MDN_HI_1YR','EARN_MDN_HI_2YR']:

    scorecard.loc[scorecard[var]=='PrivacySuppressed',var] = np.nan

    scorecard[var] = scorecard[var].apply(lambda x: float(x))
    
##############################################################################
# Create clean major categories for majoring with GD
##############################################################################

print("Assign majors to groups.")

def assignMajors(df,majorVar,saveVar):

    df[saveVar] = np.nan
    
    df.loc[df[majorVar].isnull(),saveVar] = "missing"

    df[majorVar] = df[majorVar].apply(lambda x: str(x).lstrip().rstrip())
    df[majorVar] = df[majorVar].apply(lambda x: str(x).replace(".",""))
    df[majorVar] = df[majorVar].apply(lambda x: str(x).replace("  "," "))
    df[majorVar] = df[majorVar].apply(lambda x: str(x).title())


    # Arts and Humanities
    df.loc[df[majorVar].apply(lambda x: str(x).count('Art'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('English'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('History'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Journalism'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Language'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Literature'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Music'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Philosophy'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Speech'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Photo'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Acting'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Theater'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Theatre'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Cinema'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Film'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Drama'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Religious'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Religion'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Theology'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Humanities'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Writing'))>0,saveVar] = 'Arts and Humanities'
    # New additions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Spanish'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('French'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Italian'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('German'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Japan'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Chinese'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Russian'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Linguistics'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Liberal Studies'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Painting'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Sculpture'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Dance'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Illustration'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Architect'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Design'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Animation'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Archaeology'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Biblical'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Vocal Performance'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Classics'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Bfa'),saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Mfa'),saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Cultural'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Fashion'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Clothing'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Playwrit'))>0,saveVar] = 'Arts and Humanities'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Screenwrit'))>0,saveVar] = 'Arts and Humanities'


    # Biological Sciences
    df.loc[df[majorVar].apply(lambda x: str(x).count('Biology'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Biochem'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Biophysics'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Botany'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Environmental Science'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Marine Science'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Life Science'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Microbiology'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Bacteriology'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Zoology'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Biological'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Environment'))>0,saveVar] = 'Biological Sciences'
    # New additions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Neuroscience'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Sustainability'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Forestry'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Horticult'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Ecology'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Agricult'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Biotech'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Psychobiology'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Food Science'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Animal Science'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Bioinform'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Genetics'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Physiology'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Biomed'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Biostatistics'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Natural Science'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Natural Resources'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Neurobiology'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Agronomy'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Animal'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Atmospheric'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Bioscience'))>0,saveVar] = 'Biological Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Plant'))>0,saveVar] = 'Biological Sciences'
    
    # Business
    df.loc[df[majorVar].apply(lambda x: str(x).count('Accountancy'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Accounting'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Advertising'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Business'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Buisness'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Commerce'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Financ'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Marketing'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Management'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Merchandising'))>0,saveVar] = 'Business'
    # New additions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Human Resource'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Actuarial'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Entrepreneur'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Bcom'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('BCom'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Real Estate'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Operations'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Supply'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Industrial'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Banking'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Tax'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Strategy'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Organizational Leadership'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Hospitality'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Mba'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Insurance'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Tourism'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Organization'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Human Relations'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Leadership'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Organisation'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Entreprenuer'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Corporate'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Hotel'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Labor Relations'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Employment Relations'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Logistics'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Managerial'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Strategic'))>0,saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Bba'),saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Mis'),saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Hr'),saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Bookkeeping'),saveVar] = 'Business'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Customer Service'),saveVar] = 'Business'


    # Communication
    df.loc[df[majorVar].apply(lambda x: str(x).count('Broadcast'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Communication'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Public Relations'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Media'))>0,saveVar] = 'Communication'
    # New additions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Multimedia'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Television'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Telecomm'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Audio Production'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Journalism'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Speech'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Media'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Event Planning'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Esl'),saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Publishing'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Video Production'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Translation'))>0,saveVar] = 'Communication'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Visual Effects'))>0,saveVar] = 'Communication'
    

    # Engineering
    df.loc[df[majorVar].apply(lambda x: str(x).count('Engineer'))>0,saveVar] = 'Engineering'
    # New additions
    df.loc[df[majorVar].apply(lambda x: str(x)=='Ece'),saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Ee'),saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Electronic'))>0,saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Bioengineering'))>0,saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Mechanical'))>0,saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Electrical'))>0,saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Welding'))>0,saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Aeronautic'))>0,saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Mech Eng'))>0,saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Mechatronics'))>0,saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Materials'))>0,saveVar] = 'Engineering'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Eee'))>0,saveVar] = 'Engineering'


    # Education
    df.loc[df[majorVar].apply(lambda x: str(x).count('Education'))>0,saveVar] = 'Education'
    # New additions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Teach'))>0,saveVar] = 'Education'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Curriculum'))>0,saveVar] = 'Education'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Elementary'))>0,saveVar] = 'Education'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Early Childhood'))>0,saveVar] = 'Education'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Child Development'))>0,saveVar] = 'Education'


    # Physical Sciences
    df.loc[df[majorVar].apply(lambda x: str(x).count('Astronomy'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Meteorology'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Chemistry'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Earth Science'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Geology'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Math'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Physics'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Statistics'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Physical Science'))>0,saveVar] = 'Physical Sciences'
    # New additions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Computational'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Astrophysics'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Analytics'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Quantitative'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Geoscience'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Geophysics'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Geological'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Geochemistry'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('General Science'))>0,saveVar] = 'Physical Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Science'))>0,saveVar] = 'Physical Sciences'

    
    # Health Professions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Health Technology'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Medic'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Dent'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Laboratory'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Veterinar'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Nursing'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Nurse'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Pharmac'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Therapy'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Public Health'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Health Sciences'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Bsn'),saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Rn'),saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Cna'),saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Emt'),saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Lpn'),saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Phlebot'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Immun'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Allied Health'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Physician Assistant'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Exercise Science'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Healthcare'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Health Care'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Health Service'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Mental Health'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Nutrition'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Dietetics'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Epidemiology'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Radiology'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Pre-Med'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Pre-Health'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Pre-Vet'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Premed'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Physical Therapist'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Athletic Training'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Behavior Analysis'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Sports and Fitness'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Sports And Fitness'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Health and Wellness'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Human Development'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Health Studies'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Clinical'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Exercise'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Health'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Hospital Administration'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Kinesiology'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Occupational'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Paramedic'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Personal Train'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Physician'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Physio'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Radiography'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Radiologic'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Rehabilitation'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Bpharm'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Respiratory Care'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Optometry'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Pediatrics'))>0,saveVar] = 'Health Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Audiology'))>0,saveVar] = 'Health Service'

    # Social Service Professions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Criminal Justice'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Criminology'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Forensics'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Justice'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Urban Planning'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Law'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Military Science'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Legal'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Library'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Archival Science'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Paralegal'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Regional Planning'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Social Work'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Public Policy'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Urban Planning'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Public Policy'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Public Safety'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Public Administration'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Human Services'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Police'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Forensic'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Counseling'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Fire Science'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Homeland Security'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Public Affairs'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Juris Doctor'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Socialwork'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Social Service'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Jd'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Criminal'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Social Care'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Public Service'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Welfare'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Jurisprudence'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Human Rights'))>0,saveVar] = 'Social Service'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Museum'))>0,saveVar] = 'Social Service'

    # New additions
    # Social Sciences
    df.loc[df[majorVar].apply(lambda x: str(x).count('Anthropology'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Econom'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Ethnic Studies'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Geography'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Political Science'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Government'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('International Relations'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Psycholog'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Psycolog'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Pyschology'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Social Work'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Sociology'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Gender Studies'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Social Science'))>0,saveVar] = 'Social Sciences'
    # New additions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Politic'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('International'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count("Women\'S Studies"))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count("Behavioral Science"))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count("American"))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count("Global"))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count("Cognitive Science"))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count("Asian Studies"))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count("European Studies"))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count("Urban Studies"))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count("Decision Science"))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Foreign'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Family And Consumer Sciences'))>0,saveVar] = 'Social Sciences'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Development Studies'))>0,saveVar] = 'Social Sciences'
    
    # Technology
    df.loc[df[majorVar].apply(lambda x: str(x).count('Data'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Technology'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Computer'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Software'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('System'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Network'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Cyber'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Web'))>0,saveVar] = 'Technology'
    # New additions
    df.loc[df[majorVar].apply(lambda x: str(x).count('Information'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x)=='It'),saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Cs'),saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Cis'),saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x)=='MTech'),saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x)=='Mtech'),saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Cse'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Computing'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('CompSc'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Compsc'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('It Program'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('It Security'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Informatics'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Machine Learning'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('BTech'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Btech'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('MTech'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Bca'))>0,saveVar] = 'Technology'
    df.loc[df[majorVar].apply(lambda x: str(x).count('Mca'))>0,saveVar] = 'Technology'

    # Other
    df.loc[df[saveVar].isnull() & df[majorVar].notnull(),saveVar] = 'Other'

    return df

scorecard = assignMajors(scorecard,'CIPDESC','grpMajor')
    
##############################################################################
# Create clean degree categories for merging with GD
##############################################################################
    
scorecard.loc[scorecard['CREDDESC']=="Associate's Degree",'degreeScorecard'] = 'ASSOCIATES'

scorecard.loc[scorecard['CREDDESC']=="Bachelors Degree",'degreeScorecard'] = 'BACHELORS'

scorecard.loc[scorecard['CREDDESC']=="Master's Degree",'degreeScorecard'] = 'MASTERS'

scorecard.loc[scorecard['CREDDESC']=="Doctoral Degree",'degreeScorecard'] = 'PHD'

scorecard.loc[scorecard['CREDDESC']=="Graduate/Professional Certificate",'degreeScorecard'] = 'POSTGRAD'

scorecard.loc[scorecard['CREDDESC']=="Post-baccalaureate Certificate",'degreeScorecard'] = 'POSTGRAD'

scorecard.loc[scorecard['CREDDESC']=="Doctoral Degree",'degreeScorecard'] = 'PHD'

scorecard.loc[scorecard['CREDDESC']=="Undergraduate Certificate or Diploma",'degreeScorecard'] = 'DIPLOMA'

##############################################################################
# Calculate average median earnings within SCHOOL-MAJOR-DEGREE
##############################################################################

median_one_year = scorecard[scorecard['EARN_COUNT_WNE_HI_1YR'].notnull()].groupby(['INSTNM','degreeScorecard','grpMajor']).apply(wavg,'EARN_MDN_HI_1YR','EARN_COUNT_WNE_HI_1YR').reset_index()
median_one_year.rename(columns={0:'median_1year_scorecard'},inplace=True)

count_one_year = scorecard[scorecard['EARN_COUNT_WNE_HI_1YR'].notnull()].groupby(['INSTNM','degreeScorecard','grpMajor']).EARN_COUNT_WNE_HI_1YR.sum().reset_index()
count_one_year.rename(columns={'EARN_COUNT_WNE_HI_1YR':'count_1year_scorecard'},inplace=True)

median_two_year = scorecard[scorecard['EARN_COUNT_WNE_HI_2YR'].notnull()].groupby(['INSTNM','degreeScorecard','grpMajor']).apply(wavg,'EARN_MDN_HI_2YR','EARN_COUNT_WNE_HI_2YR').reset_index()
median_two_year.rename(columns={0:'median_2year_scorecard'},inplace=True)

count_two_year = scorecard[scorecard['EARN_COUNT_WNE_HI_2YR'].notnull()].groupby(['INSTNM','degreeScorecard','grpMajor']).EARN_COUNT_WNE_HI_2YR.sum().reset_index()
count_two_year.rename(columns={'EARN_COUNT_WNE_HI_2YR':'count_2year_scorecard'},inplace=True)

##############################################################################
# Merge and save results
##############################################################################

scorecard_data = pd.merge(median_one_year, median_two_year, how='left', left_on = ['INSTNM','degreeScorecard','grpMajor'], right_on = ['INSTNM','degreeScorecard','grpMajor'])
scorecard_data = pd.merge(scorecard_data, count_one_year, how='left', left_on = ['INSTNM','degreeScorecard','grpMajor'], right_on = ['INSTNM','degreeScorecard','grpMajor'])
scorecard_data = pd.merge(scorecard_data, count_two_year, how='left', left_on = ['INSTNM','degreeScorecard','grpMajor'], right_on = ['INSTNM','degreeScorecard','grpMajor'])

scorecard_data.rename(columns={'INSTNM':'school'},inplace=True)

scorecard_data.to_csv(savePath + "US_scorecard_earnings.csv")

print("Finished R_create_US_scorecard.")








